TIPS
Write the following commands in code editor of R Studio and run them using icon Run in R Studio
Important notes:
<-) or equals
(=). These mean the same thing.
<- arrow is the “standard” in R, so we will use
that.Look at the Environment panel (top right) as you run these commands: you will see the variables appear as you define them.
# create different type of variables, note that the name must not start with a number
# can use <- or = for assignment of values
a <- 100
b <- "hello"
c <- TRUE # can also use T as shortcut
d <- FALSE # can also use F as shortcut
# Just type the variable name to see its value
a
## [1] 100
# You can see the type of variable by using class()
class(b)
## [1] "character"
We have several ways of combining individual values:
1-dimensional:
2-dimensional:
# vector of numbers
x <- c(6,5,4)
# vector of characters
y <- c("mouse","cat","dog")
x
## [1] 6 5 4
y
## [1] "mouse" "cat" "dog"
# list of different things
# it can have ANY combination of objects you want!
# the last part is a vector
z <- list(4, "cat", TRUE, c(1,2))
z
## [[1]]
## [1] 4
##
## [[2]]
## [1] "cat"
##
## [[3]]
## [1] TRUE
##
## [[4]]
## [1] 1 2
# matrix
## 1:6 is a shortcut for a vector of all numbers from 1 to 6
A <- matrix(c(1:6), nrow=3)
A
## [,1] [,2]
## [1,] 1 4
## [2,] 2 5
## [3,] 3 6
# data frame
## can combine different vectors of different types
## these MUST be the same length
df <- data.frame(x ,y)
df
Check the size:
length(x)
## [1] 3
dim(A)
## [1] 3 2
nrow(A)
## [1] 3
Arithmetic:
x * y; \(3/4\)Boolean:
a & ba | bSorting objects: using order()
Transpose a matrix: swap the rows and columns, t() function
# arithmetic
1 + 1
## [1] 2
12/3
## [1] 4
# arithmetic on a vector or matrix
A + A # element-wise addition
## [,1] [,2]
## [1,] 2 8
## [2,] 4 10
## [3,] 6 12
A * A # element-wise multiplication
## [,1] [,2]
## [1,] 1 16
## [2,] 4 25
## [3,] 9 36
# boolean
TRUE & FALSE
## [1] FALSE
T | F # T and F are shortcuts for TRUE and FALSE
## [1] TRUE
# sorting: order() gives the order of the INDICES
x
## [1] 6 5 4
order(x) # this is the order of the indices in order to sort
## [1] 3 2 1
x[order(x)] # this is the sorted vector
## [1] 4 5 6
# sort a data frame alphabetically by the second column
df[ order(df[,2]), ]
# transpose a matrix
t(A)
## [,1] [,2] [,3]
## [1,] 1 2 3
## [2,] 4 5 6
Combine objects:
Subset objects. Note that in R, the indexing starts at 1: vec[1] gets the first item in a list, vec[2] gets the second, etc.
Set membership: %in%
# combine vectors
c( c(1,2,3), 4:8 )
## [1] 1 2 3 4 5 6 7 8
# combine lists
c( list("a","b"), list(1,c(2,3)) )
## [[1]]
## [1] "a"
##
## [[2]]
## [1] "b"
##
## [[3]]
## [1] 1
##
## [[4]]
## [1] 2 3
# combine matrices by column
cbind(A, A)
## [,1] [,2] [,3] [,4]
## [1,] 1 4 1 4
## [2,] 2 5 2 5
## [3,] 3 6 3 6
# combine matrices by row
rbind(A, A)
## [,1] [,2]
## [1,] 1 4
## [2,] 2 5
## [3,] 3 6
## [4,] 1 4
## [5,] 2 5
## [6,] 3 6
# subset a vector
x[1] # first element
## [1] 6
x[c(1,2)] # first two elements (vector of indices)
## [1] 6 5
# subset a matrix
A[1,1] # first row, first column
## [1] 1
A[1:2, 1] # first two rows, first column; this turns the result into a vector though
## [1] 1 2
A[1:2, 1, drop=F] # force R to keep it like a matrix
## [,1]
## [1,] 1
## [2,] 2
A[1:2, ] # first two rows, all columns
## [,1] [,2]
## [1,] 1 4
## [2,] 2 5
# subset a data frame
df[1] # this is still a data frame, getting first column
df[[1]] # this is a vector
## [1] 6 5 4
# set membership: %in% operator
y
## [1] "mouse" "cat" "dog"
"cat" %in% y
## [1] TRUE
"kitten" %in% y
## [1] FALSE
y %in% "cat"
## [1] FALSE TRUE FALSE
In a vector or list, we can name our elements. In a matrix or data frame, we can name the rows and columns. This is very useful for referencing values later.
Names can be used in brackets instead of numbers, e.g., vec[“value1”]. For lists and data frames you can also use $ to refer to names (refers to column names for data frames).
# naming vector x
x
## [1] 6 5 4
names(x) = c("first","second","third")
x
## first second third
## 6 5 4
x["second"] # gets element named 'second'
## second
## 5
# make a named vector
newx = c("cat"=1, "dog"=2)
newx
## cat dog
## 1 2
# assign row names and column names to df
rownames(df) = c("r1","r2","r3")
colnames(df) = c("size","animal")
df
df["r1","animal"]
## [1] "mouse"
df[c("r1","r3"), ]
df$animal # this won't work for a matrix or vector, only a data frame or a list
## [1] "mouse" "cat" "dog"
A factor is a special type of vector used to store categorical information. For example, storing group metadata. Example:
Factors are very important when we set up statistical analyses and need to define groups.
f = factor(c("WT","WT","KO1","KO1","KO2","KO2"))
f
## [1] WT WT KO1 KO1 KO2 KO2
## Levels: KO1 KO2 WT
A factor has:
# see the levels of a factor
# NOTE: this is a character vector with the unique values in f
levels(f)
## [1] "KO1" "KO2" "WT"
# change the name of the WT level
levels(f)[3] = "Wild-Type"
f
## [1] Wild-Type Wild-Type KO1 KO1 KO2 KO2
## Levels: KO1 KO2 Wild-Type
# redefine f so "WT" comes first
f = factor(c("WT","WT","KO1","KO1","KO2","KO2"),levels=c("WT","KO1","KO2"))
f
## [1] WT WT KO1 KO1 KO2 KO2
## Levels: WT KO1 KO2
# ordinal factor
f2 = factor(c("time1","time2","time3"), ordered=T)
f2
## [1] time1 time2 time3
## Levels: time1 < time2 < time3
# try to add a new value to a factor
f[6] = "KO3" ## generates an ERROR!
f ## last value is now NA (missing)
## [1] WT WT KO1 KO1 KO2 <NA>
## Levels: WT KO1 KO2
# common use case: make certain columns of a data frame factors
df[,2] = factor(df[,2])
df # looks the same
df[,2] # now we see the levels
## [1] mouse cat dog
## Levels: cat dog mouse
This is just for our practice, as the max() function already exists in R.
Note: Use indentation (tab) to make codes more human readable.
get.max <- function(x){
max <- x[1]
for (i in x){
if (i > max){
max <- i
}
}
return(max)
}
a <- c(23.3, 1, 3, 55, 6)
get.max(a)
## [1] 55
max(a) # this is the built-in max function
## [1] 55
For this we will use a built-in data frame in R, mtcars.
# Built in data frame mtcars
?mtcars # get info on mtcars data set
dim(mtcars) # get dimensions of data frame
## [1] 32 11
head(mtcars) # See first six rows
# apply function
# find the max value per feature
apply(mtcars, 2, max)
## mpg cyl disp hp drat wt qsec vs am gear
## 33.900 8.000 472.000 335.000 4.930 5.424 22.900 1.000 1.000 5.000
## carb
## 8.000
# which cars have the max features?
# here we define a function WITHIN the apply statement
# what type of object does this return?
apply(mtcars, 2, function(x) rownames(mtcars)[x==max(x)])
## $mpg
## [1] "Toyota Corolla"
##
## $cyl
## [1] "Hornet Sportabout" "Duster 360" "Merc 450SE"
## [4] "Merc 450SL" "Merc 450SLC" "Cadillac Fleetwood"
## [7] "Lincoln Continental" "Chrysler Imperial" "Dodge Challenger"
## [10] "AMC Javelin" "Camaro Z28" "Pontiac Firebird"
## [13] "Ford Pantera L" "Maserati Bora"
##
## $disp
## [1] "Cadillac Fleetwood"
##
## $hp
## [1] "Maserati Bora"
##
## $drat
## [1] "Honda Civic"
##
## $wt
## [1] "Lincoln Continental"
##
## $qsec
## [1] "Merc 230"
##
## $vs
## [1] "Datsun 710" "Hornet 4 Drive" "Valiant" "Merc 240D"
## [5] "Merc 230" "Merc 280" "Merc 280C" "Fiat 128"
## [9] "Honda Civic" "Toyota Corolla" "Toyota Corona" "Fiat X1-9"
## [13] "Lotus Europa" "Volvo 142E"
##
## $am
## [1] "Mazda RX4" "Mazda RX4 Wag" "Datsun 710" "Fiat 128"
## [5] "Honda Civic" "Toyota Corolla" "Fiat X1-9" "Porsche 914-2"
## [9] "Lotus Europa" "Ford Pantera L" "Ferrari Dino" "Maserati Bora"
## [13] "Volvo 142E"
##
## $gear
## [1] "Porsche 914-2" "Lotus Europa" "Ford Pantera L" "Ferrari Dino"
## [5] "Maserati Bora"
##
## $carb
## [1] "Maserati Bora"
# tapply function
# average mpg based on number of cylinders
tapply(mtcars$mpg, mtcars$cyl, mean)
## 4 6 8
## 26.66364 19.74286 15.10000
# sapply function
# check which cars have different numbers of cylinders
# first get the different cylinder counts
# unique() gives the unique entries in a vector
cyls <- unique(mtcars$cyl)
cyls
## [1] 6 4 8
# order from smallest to biggest
cyls <- cyls[order(cyls)]
cyls
## [1] 4 6 8
# car names per cylinder count
sapply(cyls, function(x) rownames(mtcars)[mtcars$cyl==x])
## [[1]]
## [1] "Datsun 710" "Merc 240D" "Merc 230" "Fiat 128"
## [5] "Honda Civic" "Toyota Corolla" "Toyota Corona" "Fiat X1-9"
## [9] "Porsche 914-2" "Lotus Europa" "Volvo 142E"
##
## [[2]]
## [1] "Mazda RX4" "Mazda RX4 Wag" "Hornet 4 Drive" "Valiant"
## [5] "Merc 280" "Merc 280C" "Ferrari Dino"
##
## [[3]]
## [1] "Hornet Sportabout" "Duster 360" "Merc 450SE"
## [4] "Merc 450SL" "Merc 450SLC" "Cadillac Fleetwood"
## [7] "Lincoln Continental" "Chrysler Imperial" "Dodge Challenger"
## [10] "AMC Javelin" "Camaro Z28" "Pontiac Firebird"
## [13] "Ford Pantera L" "Maserati Bora"
bw_data <- read.delim("https://wd.cri.uic.edu/R/birth_weight.txt")
head(bw_data)
data_ordered_by_age <- bw_data[order(bw_data$age), ]
head(data_ordered_by_age)
write.table(data_ordered_by_age,"birth_weight_ordered_by_age.txt",sep="\t",
quote=F,row.names=F)
In R Studio you can select Packages tab in lower
right pane to view all installed packages. Also possible to list via R command.
# List all installed packages
installed.packages()
Check if select packages are installed.
c("ggplot2", "ComplexHeatmap") %in% rownames(installed.packages())
## [1] FALSE FALSE
install.packages("BiocManager”)
BiocManager::install("ComplexHeatmap", update=F)
library(ComplexHeatmap)
If you are asked for a CRAN mirror site, please select one which is close to your location: e.g. “USA (IN)” Tidyverse is a collection of packages, including tidyr, dplyr, and ggplot2. Technically each of these are dependencies for the tidyverse package, and so will get installed also.
install.packages("tidyverse")
# load the package
library(tidyr)
library(dplyr)
library(ggplot2)
NOTE: Pipes (%>%) to head() are just so that we see the first 6 lines of each output.
# Load the tidyverse
library(tidyverse)
# some exercises with mtcars
head(mtcars)
# Move car names to a column
data <- mtcars %>%
rownames_to_column(var = "car")
head(data)
# Example 1: Select only mpg, cyl, and hp
# pipe to head() at the end to see the first few lines
data %>%
select(car, mpg, cyl, hp) %>% head()
# Example 2: Rename columns for clarity
data %>%
rename(miles_per_gallon = mpg,
horsepower = hp) %>% head()
# Example 3: Filter cars with mpg greater than 25
data %>%
filter(mpg > 25) %>% head()
# Example 4: Filter cars with 4 or 6 cylinders
data %>%
filter(cyl %in% c(4, 6)) %>% head()
# Example 5: Create a new variable called power_to_weight
data %>%
mutate(power_to_weight = hp / wt) %>% head()
# Example 6: Use `case_when` to categorize cars by engine size
data %>%
mutate(engine_size = case_when(
disp < 150 ~ "small",
disp < 300 ~ "medium",
TRUE ~ "large"
)) %>% head()
# Example 7: Average mpg by number of cylinders
data %>%
group_by(cyl) %>%
summarise(mean_mpg = mean(mpg))
# Example 8: Count cars per gear group
data %>%
count(gear)
# Example 9: Pivot mpg, hp, and wt into long format
## note: showing a tibble will only show the first few rows
## don't need to use head()
data_long <- data %>%
pivot_longer(cols = c(mpg, hp, wt),
names_to = "variable",
values_to = "value")
data_long
# Example 10: Pivot back to wide format
data_long %>%
pivot_wider(names_from = variable, values_from = value)
# Example 11: Introduce some NA values
data_with_na <- data %>%
mutate(mpg = replace(mpg, mpg < 20, NA))
head(data_with_na)
# Example 12: Drop rows with NA
data_with_na %>%
drop_na() %>% head()
# Example 13: Replace NA with a fixed value
data_with_na %>%
replace_na(list(mpg = 0)) %>% head()
Read in the data for this exercise. This is a table of cell metadata and selected gene expression levels from a single-cell RNA-seq project. The columns in this table are:
sc <- read.delim("http://wd.cri.uic.edu/R/scRNA_cells.txt",row.names=1)
# convert the Cluster column to a factor
sc$Cluster = factor(sc$Cluster)
head(sc)
We will make various versions of a UMAP plot.
# basic scatterplot
plot(sc$UMAP_1, sc$UMAP_2)
# colored by cluster - this is kind of clunky
cell_colors = factor(sc$Cluster)
levels(cell_colors) = rainbow(length(levels(cell_colors)))
plot(sc$UMAP_1, sc$UMAP_2, col=cell_colors)
Note on the above syntax: we’re using factors as a shortcut to make unique colors for the clusters:
rainbow function gives an even spacing of colors across the
rainbow for a given number of colors.It would be even better to add a legend. This is possible in base R plots, but much easier with ggplot2.
ggplot2 makes it much easier to add more features to our plot, like automatically coloring dots based on a categorical label and adding a legend.
library(ggplot2)
ggplot(sc, aes(x = UMAP_1, y = UMAP_2, color = Cluster)) +
geom_point()
# to save as a PDF:
pdf("basic_UMAP_plot.pdf")
ggplot(sc, aes(x = UMAP_1, y = UMAP_2, color = Cluster)) +
geom_point()
dev.off()
facet_wrap)If you do not see a plot appear, make sure you have run dev.off() above.
ggplot(sc, aes(x = UMAP_1, y = UMAP_2, color = Cluster)) +
geom_point() +
facet_wrap( ~ Genotype)
facet_grid)ggplot(sc, aes(x = UMAP_1, y = UMAP_2, color = Cluster)) +
geom_point() +
facet_grid( Batch ~ Genotype)
ggplot(sc, aes(x = UMAP_1, y = UMAP_2, color = Cluster)) +
geom_point() +
theme_classic() +
labs(x="UMAP1", y="UMAP2", color="Cell type", title="UMAP by cluster")
ggplot(sc, aes(x = UMAP_1, y = UMAP_2, color = Cxcr6)) +
geom_point()
# nicer coloring with viridis palette
ggplot(sc, aes(x = UMAP_1, y = UMAP_2, color = Cxcr6)) +
geom_point() +
scale_color_viridis_c()
ggplot(sc, aes(x = UMAP_1, y = UMAP_2, color = Genotype)) +
geom_point() +
scale_color_manual(values=c("WT"="red","KO"="blue"))
These are both ways to plot the distribution of values.
# as a boxplot
ggplot(sc, aes(x=Cluster, y=Cxcr6, fill=Genotype)) +
geom_boxplot()
# as a violin plot
ggplot(sc, aes(x=Cluster, y=Cxcr6, fill=Genotype)) +
geom_violin()
# with variables!
baseplot <- ggplot(sc, aes(x=Cluster, y=Cxcr6, fill=Genotype))
baseplot + geom_boxplot()
baseplot + geom_violin()
This is an example of showing expression for many marker genes over all clusters in a compact visualization.
# make a new data frame with just the clusters, and the gene expression levels
gene_expr <- sc[,c(7,10:17)]
head(gene_expr)
# convert to the "long" format. Load the tidyr library if you haven't already.
library(tidyr)
gene_expr_long <- pivot_longer(gene_expr, !Cluster)
gene_expr_long
# basic violin plot
# we're using the faceting to show the different groups,
# rather than separating along the x-axis
basic_vioplot <- ggplot(gene_expr_long, aes(x=NA,y=value,fill=Cluster)) +
geom_violin() + facet_grid(Cluster ~ name)
basic_vioplot
# make it look nicer
basic_vioplot +
theme_void() +
coord_flip() +
guides(fill="none") +
theme(strip.text.x = element_text(angle=45))
Where:
theme_void() removes the x and y axis tics, labels, and
the background grid.coord_flip() makes the violins go left-to-right instead
of up and down.guides(fill="none") turns off the legend for the fill
color, as this is redundant with the cluster facet labels.theme(strip.text.x = element_text(angle=45)) rotates
the x-direction facet labels (gene names) by 45 degrees.# using geom_bar: let ggplot2 count for you
ggplot(sc, aes(x=Cluster)) + geom_bar()
# do the count first and use geom_col()
cluster_counts <- table(sc$Cluster)
cluster_counts
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13
## 949 891 809 718 687 566 526 486 310 303 212 179 106 97
cluster_counts_df <- data.frame(cluster_counts)
cluster_counts_df
ggplot(cluster_counts_df, aes(x=Var1, y=Freq)) + geom_col()
# stacked barplot
ggplot(sc, aes(x=Cluster, fill=Genotype)) +
geom_bar()
# side-by-side barplot
ggplot(sc, aes(x=Cluster, fill=Genotype)) +
geom_bar(position="dodge")
# facet by TCR status
ggplot(sc, aes(x=Cluster, fill=Genotype)) +
geom_bar(position="dodge") +
facet_wrap(~HasTCR)
The test data set are the top 100 differentially expressed genes (DEGs) from an RNA-seq project with 3 groups (Control, disease model 1, disease model 2). The values in this table are log2 CPM (log-scaled normalized expression).
library(ComplexHeatmap)
degs <- read.delim("http://wd.cri.uic.edu/R/degs.txt",row.names=1)
# look at the top of the file
head(degs)
# z-score expression
degs.z <- t(scale(t(degs)))
# plot a heatmap
Heatmap(degs.z, name="Z-score")
# turn off row names
Heatmap(degs.z, name="Z-score", show_row_names=F)
# add a color label for the groups
groups <- gsub("\\.[0-9]*$","",colnames(degs))
groups
## [1] "Control" "Control" "Control" "Control" "Model1" "Model1" "Model1"
## [8] "Model1" "Model2" "Model2" "Model2" "Model2"
group_colors <- c("Control"="blue","Model1"="orange","Model2"="purple")
column_label <- HeatmapAnnotation(df=data.frame(Group=groups),
col=list(Group=group_colors), which="column")
Heatmap(degs.z, name="Z-score", show_row_names=F, top_annotation=column_label)
We’ll the same RNA-seq data as above. NOTE: usually you would compute PCA over all genes, rather than just the top ones as in this example.
pca <- prcomp(t(degs))
# We can use the summary() function to calculate summary statistics on the pca
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 20.0719 3.28916 1.09160 1.02606 0.90698 0.80002 0.71785
## Proportion of Variance 0.9614 0.02582 0.00284 0.00251 0.00196 0.00153 0.00123
## Cumulative Proportion 0.9614 0.98720 0.99004 0.99256 0.99452 0.99605 0.99728
## PC8 PC9 PC10 PC11 PC12
## Standard deviation 0.69225 0.53578 0.47392 0.38804 2.653e-15
## Proportion of Variance 0.00114 0.00069 0.00054 0.00036 0.000e+00
## Cumulative Proportion 0.99842 0.99910 0.99964 1.00000 1.000e+00
# This can be stored in an R object as a list
pca_stats <- summary(pca)
names(pca_stats) # Shows the elements in the list
## [1] "sdev" "rotation" "center" "scale" "x"
## [6] "importance"
importance <- pca_stats$importance # Get information on PC importance
importance
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 20.07188 3.289157 1.091597 1.026061 0.9069831 0.8000232
## Proportion of Variance 0.96138 0.025820 0.002840 0.002510 0.0019600 0.0015300
## Cumulative Proportion 0.96138 0.987200 0.990040 0.992560 0.9945200 0.9960500
## PC7 PC8 PC9 PC10 PC11
## Standard deviation 0.7178452 0.6922487 0.5357828 0.4739224 0.388043
## Proportion of Variance 0.0012300 0.0011400 0.0006900 0.0005400 0.000360
## Cumulative Proportion 0.9972800 0.9984200 0.9991000 0.9996400 1.000000
## PC12
## Standard deviation 2.653185e-15
## Proportion of Variance 0.000000e+00
## Cumulative Proportion 1.000000e+00
# add the group metadata to the PC coordinates
pca_coords = data.frame(pca$x)
head(pca_coords)
pca_coords$Group = groups
# PCA plot
# add the % explained variance for each PC in the axis labels
ggplot(data = pca_coords, aes(x=PC1, y=PC2, color=Group)) +
geom_point() +
theme_classic() +
labs(x = paste0("PC1 (", round(100*importance[2,1], 1), "%)"),
y = paste0("PC2 (", round(100*importance[2,2], 1), "%)"))
# scree plot
screeplot(pca)
NOTE: although all three groups look well-separated in the PC1 vs PC2 plot, note that the biggest differences are with respect to Model1: this is separated along PC1, which explains the vast majority of the variance. This is consistent with what we saw in the heatmap.
For this we’ll use the ggrepel package, which does a good job of fitting text labels into a plot.
Install the package first:
install.packages("ggrepel")
library(ggrepel)
# add the sample names to our data frame
pca_coords$Sample = rownames(pca_coords)
# PCA plot, with label aesthetics and geom_text_repel layer
# set show legend=F to avoid the duplicate legend for text colors
ggplot(data = pca_coords, aes(x=PC1, y=PC2, color=Group, label=Sample)) +
geom_point() +
theme_classic() +
geom_text_repel(show.legend=F) +
labs(x = paste0("PC1 (", round(100*importance[2,1], 1), "%)"),
y = paste0("PC2 (", round(100*importance[2,2], 1), "%)"))
We will use the birth weight data from earlier today. Read that back in if needed.
bw_data <- read.delim("https://wd.cri.uic.edu/R/birth_weight.txt")
Caution: a large sample size will usually result in a significant p-value from Shapiro test, because any tiny deviation from normality will be detected. We should also look at how far W is from 1 to see how big the deviation is. For reasonably small differences, the normality assumption is probably still OK.
length(bw_data$bwt)
## [1] 1174
hist(bw_data$bwt)
shapiro.test(bw_data$bwt)
##
## Shapiro-Wilk normality test
##
## data: bw_data$bwt
## W = 0.99563, p-value = 0.001917
# use QQ-normal plot to check the normality.
# if the data is normally distributed, the points in the QQ-normal plot will
# lie on a straight diagonal line.
qqnorm(bw_data$bwt)
# qqline shows a line for a "theoretical" normal distribution
qqline(bw_data$bwt, col="red")
# alternative: test on raw counts from RNA-seq
# get raw read count data
raw.count <- read.delim("http://wd.cri.uic.edu/advanced_R/raw.count.txt",row.names=1)
# convert to the gene's raw read count to a numeric vector
count <- as.numeric(raw.count[1, ])
# there are 46 data points
length(count)
## [1] 46
# check if it is a bell shape in the histogram of raw read counts
hist(count)
shapiro.test(count)
##
## Shapiro-Wilk normality test
##
## data: count
## W = 0.49027, p-value = 2.014e-11
qqnorm(count)
qqline(count, col="red")
Birth weight values are close to normal. The RNA-seq values are not close to normal.
Test birth weight vs mother’s smoking status.
bw_data$smoke <- factor(bw_data$smoke)
# t-test
t.test(bwt ~ smoke, data=bw_data)
##
## Welch Two Sample t-test
##
## data: bwt by smoke
## t = 8.6265, df = 941.81, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## 7.158132 11.374153
## sample estimates:
## mean in group 0 mean in group 1
## 123.0853 113.8192
# the result is a list
# we can access values with $
ttest_result <- t.test(bwt ~ smoke, data=bw_data)
ttest_result
##
## Welch Two Sample t-test
##
## data: bwt by smoke
## t = 8.6265, df = 941.81, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## 7.158132 11.374153
## sample estimates:
## mean in group 0 mean in group 1
## 123.0853 113.8192
ttest_result$p.value
## [1] 2.656464e-17
ttest_result$conf.int
## [1] 7.158132 11.374153
## attr(,"conf.level")
## [1] 0.95
# wilcox test
wilcox.test(bwt ~ smoke, data=bw_data)
##
## Wilcoxon rank sum test with continuity correction
##
## data: bwt by smoke
## W = 212970, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
# visualization
ggplot(bw_data, aes(x=smoke, y=bwt)) +
geom_boxplot()
Test birth weight against categorized values of gestation.
# reload dpylr if necessary
library(dplyr)
# categories for gestational time
# based on number of weeks:
# <37 weeks: pre-term
# 37-41 weeks: full-term
# >41 weeks: late term
bw_data <- bw_data %>%
mutate(gestation_cat = case_when(
gestation < 37*7 ~ "pre term",
gestation < 41*7 ~ "full term",
TRUE ~ "late term"
))
# write as a factor, specifying level order
bw_data$gestation_cat = factor(bw_data$gestation_cat,
levels=c("pre term","full term","late term"))
# anova
anova <- aov(bwt ~ gestation_cat, data=bw_data)
summary(anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## gestation_cat 2 61546 30773 108.4 <2e-16 ***
## Residuals 1171 332512 284
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# to access the p-values directly:
anova.stats <- summary(anova)
anova.pvalues <- anova.stats[[1]][["Pr(>F)"]]
## NOTE: "NA" corresponds to the residuals row
anova.pvalues
## [1] 6.571396e-44 NA
# diagnostic plots for anova
plot(anova)
# kruskal wallis
kruskal.test(bwt ~ gestation_cat, data=bw_data)
##
## Kruskal-Wallis rank sum test
##
## data: bwt by gestation_cat
## Kruskal-Wallis chi-squared = 146.76, df = 2, p-value < 2.2e-16
# visualization
ggplot(bw_data, aes(x=gestation_cat, y=bwt)) +
geom_boxplot()
Compare birth weight to categorized gestation and smoking
# make sure smoking is a factor
bw_data$smoke <- factor(bw_data$smoke)
# without interaction
summary(aov(bwt ~ gestation_cat + smoke, data=bw_data))
## Df Sum Sq Mean Sq F value Pr(>F)
## gestation_cat 2 61546 30773 114.97 <2e-16 ***
## smoke 1 19338 19338 72.25 <2e-16 ***
## Residuals 1170 313174 268
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# with interaction
## note that gestation_cat:smoke is the interaction term
summary(aov(bwt ~ gestation_cat * smoke, data=bw_data))
## Df Sum Sq Mean Sq F value Pr(>F)
## gestation_cat 2 61546 30773 115.609 <2e-16 ***
## smoke 1 19338 19338 72.649 <2e-16 ***
## gestation_cat:smoke 2 2272 1136 4.268 0.0142 *
## Residuals 1168 310902 266
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# boxplot
ggplot(bw_data, aes(x=gestation_cat, fill=smoke, y=bwt)) +
geom_boxplot()
It appears that the effect of smoking is strongest for earlier term pregnancies. The significance of the dependence of smoking effect on pregnancy term is captured in the interaction term of the model.
Comparison of mother’s height and weight.
# correlation
cor(bw_data$height, bw_data$weight)
## [1] 0.4352874
# correlation test
cor.test(bw_data$height, bw_data$weight)
##
## Pearson's product-moment correlation
##
## data: bw_data$height and bw_data$weight
## t = 16.552, df = 1172, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3877305 0.4805332
## sample estimates:
## cor
## 0.4352874
# using non-parametric correlation coefficients
cor.test(bw_data$height, bw_data$weight, method="spearman")
##
## Spearman's rank correlation rho
##
## data: bw_data$height and bw_data$weight
## S = 134713533, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.5004735
cor.test(bw_data$height, bw_data$weight, method="kendall")
##
## Kendall's rank correlation tau
##
## data: bw_data$height and bw_data$weight
## z = 18.02, p-value < 2.2e-16
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau
## 0.3743995
# plot
ggplot(bw_data, aes(x=height, y=weight)) +
geom_point()
This time compare birth weight to gestation (comparison already done as categorized gestation), and other variables.
# simple linear regression
summary(lm(bwt ~ gestation, data=bw_data))
##
## Call:
## lm(formula = bwt ~ gestation, data = bw_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -49.348 -11.065 0.218 10.101 57.704
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -10.75414 8.53693 -1.26 0.208
## gestation 0.46656 0.03054 15.28 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.74 on 1172 degrees of freedom
## Multiple R-squared: 0.1661, Adjusted R-squared: 0.1654
## F-statistic: 233.4 on 1 and 1172 DF, p-value: < 2.2e-16
ggplot(bw_data, aes(x=gestation, y=bwt)) +
geom_point()
# add best-fit line
# se=T -> add standard error bars
ggplot(bw_data, aes(x=gestation, y=bwt)) +
geom_point() +
geom_smooth(method="lm", se=T)
## `geom_smooth()` using formula = 'y ~ x'
# multiple regression
## additive model
summary(lm(bwt ~ gestation + smoke + weight, data=bw_data))
##
## Call:
## lm(formula = bwt ~ gestation + smoke + weight, data = bw_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -51.920 -10.759 -0.279 9.743 51.354
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.62648 8.69128 -2.028 0.0428 *
## gestation 0.44809 0.02936 15.261 < 2e-16 ***
## smoke1 -8.07789 0.96444 -8.376 < 2e-16 ***
## weight 0.11818 0.02267 5.213 2.2e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.07 on 1170 degrees of freedom
## Multiple R-squared: 0.2335, Adjusted R-squared: 0.2315
## F-statistic: 118.8 on 3 and 1170 DF, p-value: < 2.2e-16
## using the full model with all interactions
summary(lm(bwt ~ gestation * smoke * weight, data=bw_data))
##
## Call:
## lm(formula = bwt ~ gestation * smoke * weight, data = bw_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -51.657 -10.597 -0.062 9.738 47.653
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.118e+02 7.032e+01 -1.590 0.11208
## gestation 7.857e-01 2.514e-01 3.125 0.00182 **
## smoke1 2.711e+01 1.093e+02 0.248 0.80409
## weight 9.945e-01 5.297e-01 1.877 0.06070 .
## gestation:smoke1 -1.197e-01 3.912e-01 -0.306 0.75965
## gestation:weight -3.140e-03 1.895e-03 -1.657 0.09777 .
## smoke1:weight -7.161e-01 8.342e-01 -0.858 0.39082
## gestation:smoke1:weight 2.520e-03 2.984e-03 0.845 0.39848
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.99 on 1166 degrees of freedom
## Multiple R-squared: 0.2431, Adjusted R-squared: 0.2386
## F-statistic: 53.51 on 7 and 1166 DF, p-value: < 2.2e-16
# note that overall explained variance is about the same
# but with more terms in the model the explanatory value
# of each variable is lower
# to test the EFFECT of each variable, using anova:
summary(aov(bwt ~ gestation * smoke * weight, data=bw_data))
## Df Sum Sq Mean Sq F value Pr(>F)
## gestation 1 65450 65450 255.871 < 2e-16 ***
## smoke 1 19533 19533 76.365 < 2e-16 ***
## weight 1 7015 7015 27.425 1.94e-07 ***
## gestation:smoke 1 3069 3069 11.997 0.000552 ***
## gestation:weight 1 538 538 2.104 0.147137
## smoke:weight 1 18 18 0.072 0.788345
## gestation:smoke:weight 1 182 182 0.713 0.398481
## Residuals 1166 298252 256
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# use different plots for comparison of variables
ggplot(bw_data, aes(x=gestation, y=bwt, color=smoke)) +
geom_point() +
geom_smooth(method="lm", se=T)
## `geom_smooth()` using formula = 'y ~ x'
ggplot(bw_data, aes(x=weight, y=bwt, color=smoke)) +
geom_point() +
geom_smooth(method="lm", se=T)
## `geom_smooth()` using formula = 'y ~ x'
Test for an association of smoking and parity.
# make sure both are factors
bw_data$smoke <- factor(bw_data$smoke)
bw_data$parity <- factor(bw_data$parity)
smoke_vs_parity <- bw_data[, c("smoke","parity")]
head(smoke_vs_parity)
# make contingency table with table()
smoke_vs_parity_table <- table(smoke_vs_parity)
smoke_vs_parity_table
## parity
## smoke 0 1
## 0 525 190
## 1 341 118
fisher.test(smoke_vs_parity_table)
##
## Fisher's Exact Test for Count Data
##
## data: smoke_vs_parity_table
## p-value = 0.7857
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.7245511 1.2590005
## sample estimates:
## odds ratio
## 0.9561918
# compute log2 odds ratio
fet <- fisher.test(smoke_vs_parity_table)
log2(fet$estimate)
## odds ratio
## -0.0646281
# barplots
# plot the total counts
ggplot(smoke_vs_parity, aes(x=smoke, fill=parity)) +
geom_bar()
# plot the proportions - this is what we're testing
ggplot(smoke_vs_parity, aes(x=smoke, fill=parity)) +
geom_bar(position="fill") +
labs(y="Fraction")
No effect observed here.
Test for association of smoking and gestation category
smoke_vs_gest <- bw_data[,c("smoke","gestation_cat")]
smoke_vs_gest_table <- table(smoke_vs_gest)
chisq.test(smoke_vs_gest_table)
##
## Pearson's Chi-squared test
##
## data: smoke_vs_gest_table
## X-squared = 9.8147, df = 2, p-value = 0.007392
# barplot
ggplot(smoke_vs_gest, aes(x=smoke, fill=gestation_cat)) +
geom_bar(position="fill") +
labs(y="Fraction")
We have a significant difference from Chi-square. Which groups show the differences? Visually it looks like the full term and late term groups have different proportions of smokers.
Test for individual differences with Fisher’s Exact test.
# store groups to run tests over
cols <- ncol(smoke_vs_gest_table)
# start an empty data frame
term_stats <- data.frame()
# run a test for each column (term group) one at a time
for(i in 1:cols){
# get the counts for this term group
term.counts <- smoke_vs_gest_table[,i]
# get the counts for all other term groups
term.other <- rowSums(smoke_vs_gest_table[,-i])
# we're specifically seeing if THIS term group has a different distribution
# of smoking vs non-smoking mothers, using fisher's exact test
term.table <- cbind(term.counts, term.other)
fet <- fisher.test(term.table)
# combine the estimate (odds ratio) and p-value) into the data frame
# and use the term group as the row name
term_stats <- rbind(term_stats, c(fet$estimate, fet$p.value) )
rownames(term_stats)[nrow(term_stats)] <- colnames(smoke_vs_gest_table)[i]
}
# set the column names, and add log2-scaled odds ratio and FDR corrected p-value
colnames(term_stats) <- c("OddsRatio","P.Value")
term_stats$Log2OddsRatio = log2(term_stats$OddsRatio)
term_stats$Q.Value = p.adjust(term_stats$P.Value,method="fdr")
term_stats
We see statistically significant differences in mother smoking frequency for full term and late term.
We will discuss the FDR correction (used above) next.
wt (size=20) with mean 10 and
standard deviation 3.ko (size=20) with mean 10 and
standard deviation 3.wt is different from
ko, and get the p-value for the test.NOTE: rnorm simulates n values (below, 20) from the
normal distribution with a given mean and standard deviation.
# start empty vector
pval <- c()
# generate 10k random data sets from the
# SAME normal distribution
for (i in 1:10000){
wt <- rnorm(20, mean=10, sd=3)
ko <- rnorm(20, mean=10, sd=3)
pval[i] <- t.test(wt, ko)$p.value
}
# how many p-values are <0.05?
sum(pval<0.05)
## [1] 488
summary(pval)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000456 0.2523210 0.4959265 0.5006591 0.7507688 0.9998409
# FDR correction
fdr <- p.adjust(pval, method="fdr")
sum(fdr<0.05)
## [1] 0
summary(fdr)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.4562 0.9868 0.9868 0.9907 0.9992 0.9998
NOTE: If you have not previously installed openxlsx then
install from CRAN.
install.packages("openxlsx")
openxlsxlibrary(openxlsx)
sheet_1 <- read.xlsx("http://wd.cri.uic.edu/advanced_R/taxa_relative.xlsx", sheet = 1)
sheet_1[1:10, 1:5]
sheet_L6 <- read.xlsx("http://wd.cri.uic.edu/advanced_R/taxa_relative.xlsx", sheet = "L6")
sheet_L6[1:10, 1:5]
bw_data <- read.delim("https://wd.cri.uic.edu/R/birth_weight.txt")
# Create a workbook with two empty sheets
wb <- createWorkbook()
addWorksheet(wb, "birth weight")
# Write the birth weight data to Excel
writeData(wb, sheet="birth weight", x=bw_data)